Multiple Linear Regression

Red Wine Quality Prediction

Author
Affiliation

Nakul R. Padalkar

Boston University

Published

September 29, 2023

Modified

November 6, 2025

1 Setting up the environment

library(gdata)
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
library(kableExtra)
library(flextable)
library(tidyverse)
library(stringr)
library(lubridate)
library(scales)
library(graphics)
library(caret)
library(psych)
library(ggcorrplot)
library(PerformanceAnalytics)

2 Importing Red Wine Quality Dataset

The goal is to model red wine quality based on physicochemical characteristics.

redDF <- read.csv("../data/winequality-red.csv",sep=",")
head(redDF) %>% kbl()
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol quality
7.4 0.70 0.00 1.9 0.076 11 34 0.9978 3.51 0.56 9.4 5
7.8 0.88 0.00 2.6 0.098 25 67 0.9968 3.20 0.68 9.8 5
7.8 0.76 0.04 2.3 0.092 15 54 0.9970 3.26 0.65 9.8 5
11.2 0.28 0.56 1.9 0.075 17 60 0.9980 3.16 0.58 9.8 6
7.4 0.70 0.00 1.9 0.076 11 34 0.9978 3.51 0.56 9.4 5
7.4 0.66 0.00 1.8 0.075 13 40 0.9978 3.51 0.56 9.4 5

3 Preparing for Machine Learning

Let’s perform the exploratory data analysis and prepare the data for machine learning. Use scatter plots, histograms, correlation values, and p-value. You can use the pairs.panels function in psych library and ggcorrplot to get pairwise correlations among variables, but I will use the PerformanceAnalytics package.

3.1 Using Standard Psych Package

library(psych)
pairs.panels(redDF)

Pairs plot of red wine dataset with psych

3.2 Using GGalley Package

library(GGally)
ggpairs(redDF, columns = c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar", "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density", "pH", "sulphates", "alcohol"),
        aes(color = as.factor(quality), alpha = 0.5),
        upper = list(continuous = wrap("cor", size = 2.5)))+ scale_colour_brewer(palette = "Set3",type = 'div',direction = 1)

Pairs plot of red wine dataset with GGally

3.3 Correlation Matrix with GGcorrplot Package

ggcorrplot(redDF %>% cor(),
  hc.order = TRUE, type = "lower",
  lab = TRUE,
  digits = 2,
  ggtheme = ggplot2::theme_light(),
)

Correlation plot of red wine dataset

3.4 Correlation Matrix with base Package

chart.Correlation(redDF, hist = T)

Correlation plot of red wine dataset basic

3.5 Target variable distribution

summary(redDF$quality)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   3.000   5.000   6.000   5.636   6.000   8.000
feqtab <- as_flextable(table(redDF$quality))
feqtab

Frequency table of quality

Var1

Count

Percent

3

10

0.6%

4

53

3.3%

5

681

42.6%

6

638

39.9%

7

199

12.4%

8

18

1.1%

Total

1,599

100.0%

The dependent variable seems ordinal; hence, linear regression is unsuitable for this problem.

3.5.1 Correlation Values

Strong correlation values >0.60 between some variables. We’ll verify this again in the assumptions section.

3.6 Data Preparation

3.6.1 Checking for Missing Values

redDF %>% is.na() %>% colSums()
#>        fixed.acidity     volatile.acidity          citric.acid 
#>                    0                    0                    0 
#>       residual.sugar            chlorides  free.sulfur.dioxide 
#>                    0                    0                    0 
#> total.sulfur.dioxide              density                   pH 
#>                    0                    0                    0 
#>            sulphates              alcohol              quality 
#>                    0                    0                    0

I am using the naniar package to check for missing values and plot them.

# Load the required libraries
library(naniar)

# Create a missing value plot
vis_miss(redDF)

Missing value plot of red wine dataset

3.6.2 Checking for Outliers

Let’s perform an outlier analysis using the boxplot function. This will output the outlier numbers and will provide the boxplot for each variable.

# Specify the variable for which you want to find outliers
variable_of_interest <- redDF$fixed.acidity

# Calculate the boxplot statistics
boxplot_stats <- boxplot.stats(variable_of_interest)

# Get the indexes of outliers
outlier_indexes <- which(variable_of_interest %in% boxplot_stats$out)

Here are the outlier indexes: 206, 207, 244, 245, 265, 295, 329, 339, 340, 348, 354, 360, 364, 365, 367, 375, 382, 392, 395, 410, 430, 441, 443, 447, 471, 473, 510, 511, 517, 539, 545, 549, 555, 556, 558, 560, 561, 565, 566, 597, 600, 602, 604, 612, 653, 681, 812, 815, 1225

# Load the required libraries
library(reshape2)

melted_data <- reshape2::melt(redDF, id.vars = 'quality')

# Create a grouped box plot
ggplot(melted_data, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = as.factor(quality))) +
  labs(title = "Grouped Box Plot of Variables vs. Quality") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("Variable")

Outlier plot of red wine dataset

3.6.3 Train/test Split

set.seed(1)
sampleSize <- round(nrow(redDF)*0.8)
idx <- sample(seq_len(sampleSize), size = sampleSize)

X.train_red <- redDF[idx,]
X.test_red <- redDF[-idx,]
# X.test_red.y <- X.test_red
# X.test_red <- subset(X.test_red, select = -c(quality) )

rownames(X.train_red) <- NULL
rownames(X.test_red) <- NULL

4 Modeling

4.1 Building the Initial Model

We won’t consider ‘residual sugar’ for our analysis.

model_red1 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + free.sulfur.dioxide + 
                   total.sulfur.dioxide + density + pH + sulphates + alcohol, 
                 data = X.train_red)
stargazer::stargazer(model_red1, type = "html", out = "regression.html" ,title = "Simple Linear Model", ci=TRUE, single.row = FALSE, no.space = FALSE, align = TRUE, digits=3, font.size = "small",  report = "vc*stp")
Simple Linear Model
Dependent variable:
quality
fixed.acidity 0.015
(-0.037, 0.068)
t = 0.574
p = 0.567
volatile.acidity -1.057***
(-1.323, -0.790)
t = -7.780
p = 0.000
citric.acid -0.177
(-0.502, 0.147)
t = -1.070
p = 0.285
chlorides -1.779***
(-2.686, -0.872)
t = -3.845
p = 0.0002
free.sulfur.dioxide 0.003
(-0.001, 0.008)
t = 1.368
p = 0.172
total.sulfur.dioxide -0.004***
(-0.005, -0.002)
t = -4.444
p = 0.00001
density -11.025
(-49.319, 27.269)
t = -0.564
p = 0.573
pH -0.383*
(-0.777, 0.010)
t = -1.908
p = 0.057
sulphates 0.794***
(0.556, 1.033)
t = 6.527
p = 0.000
alcohol 0.292***
(0.244, 0.341)
t = 11.878
p = 0.000
Constant 15.096
(-22.377, 52.569)
t = 0.790
p = 0.430
Observations 1,279
R2 0.369
Adjusted R2 0.364
Residual Std. Error 0.648 (df = 1268)
F Statistic 74.295*** (df = 10; 1268)
Note: p<0.1; p<0.05; p<0.01

4.2 Model Assumptions

4.2.1 Checking Normality of Residuals

CheckNormal <- function(model) {
  hist(model$residuals, breaks = 30)
  shaptest <- shapiro.test(model$residuals)
  print(shaptest)
  if (shaptest$p.value <= 0.05) {
    print("H0 rejected: the residuals are NOT distributed normally")
  } else {
    print("H0 failed to reject: the residuals ARE distributed normally")
  }
}
CheckNormal(model = model_red1)

#> 
#>  Shapiro-Wilk normality test
#> 
#> data:  model$residuals
#> W = 0.9897, p-value = 7.95e-08
#> 
#> [1] "H0 rejected: the residuals are NOT distributed normally"

4.2.2 Checking Homoscedasticity

library(lmtest)
CheckHomos <- function(model){
  plot(model$fitted.values, model$residuals)
  abline(h = 0, col = "red")
  BP <- bptest(model)
  print(BP)
  if (BP$p.value <= 0.05) {
    print("H0 rejected: Error variance spreads INCONSTANTLY/generating patterns (Heteroscedasticity)")
  } else {
    print("H0 failed to reject: Error variance spreads CONSTANTLY (Homoscedasticity)")
  }
}
CheckHomos(model = model_red1)

#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  model
#> BP = 56.615, df = 10, p-value = 1.575e-08
#> 
#> [1] "H0 rejected: Error variance spreads INCONSTANTLY/generating patterns (Heteroscedasticity)"

4.2.3 Checking Multicollinearity

library(car)
vif(model_red1) %>% kbl()
x
fixed.acidity 6.935469
volatile.acidity 1.779712
citric.acid 3.229450
chlorides 1.497186
free.sulfur.dioxide 1.996028
total.sulfur.dioxide 2.312040
density 4.168414
pH 2.962262
sulphates 1.368089
alcohol 2.218354

4.3 Model Improvements

4.3.1 Outlier Diagnosis

par(mfrow=c(2,2))
lapply(c(1,2,4,5), 
       function(x) plot(model_red1, 
                        which = x,
                        cook.levels = c(0.05, 0.1))) %>% invisible()

4.3.2 Removing Influential Observations and Model Rerun

to.rm <- c(78,202,245,274,1161)
X.train_red <- X.train_red[-to.rm,]
rownames(X.train_red) <- NULL
model_red2 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + chlorides + 
                   free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol, 
                 data = X.train_red)

stargazer::stargazer(model_red1,model_red2, type = "html" ,title = "Regression Models for Red Wine Dataset", ci=TRUE, single.row = FALSE, no.space = FALSE, align = TRUE, digits=5, font.size = "small",  report = "vc*stp")
Regression Models for Red Wine Dataset
Dependent variable:
quality
(1) (2)
fixed.acidity 0.01547 0.02520
(-0.03738, 0.06832) (-0.02735, 0.07775)
t = 0.57368 t = 0.93974
p = 0.56629 p = 0.34753
volatile.acidity -1.05662*** -1.03840***
(-1.32279, -0.79045) (-1.29974, -0.77706)
t = -7.78041 t = -7.78777
p = 0.00000 p = 0.00000
citric.acid -0.17735 -0.20137
(-0.50207, 0.14737) (-0.52188, 0.11914)
t = -1.07045 t = -1.23139
p = 0.28463 p = 0.21841
chlorides -1.77905*** -1.49611***
(-2.68588, -0.87222) (-2.40605, -0.58617)
t = -3.84511 t = -3.22253
p = 0.00013 p = 0.00131
free.sulfur.dioxide 0.00339 0.00391
(-0.00147, 0.00825) (-0.00088, 0.00870)
t = 1.36756 t = 1.59961
p = 0.17170 p = 0.10994
total.sulfur.dioxide -0.00364*** -0.00355***
(-0.00525, -0.00204) (-0.00514, -0.00196)
t = -4.44442 t = -4.38488
p = 0.00001 p = 0.00002
density -11.02496 -14.88229
(-49.31898, 27.26907) (-52.51200, 22.74741)
t = -0.56428 t = -0.77515
p = 0.57267 p = 0.43840
pH -0.38348* -0.38371*
(-0.77738, 0.01041) (-0.77262, 0.00521)
t = -1.90814 t = -1.93370
p = 0.05660 p = 0.05338
sulphates 0.79450*** 0.89072***
(0.55593, 1.03307) (0.64894, 1.13249)
t = 6.52710 t = 7.22053
p = 0.00000 p = 0.00000
alcohol 0.29243*** 0.29981***
(0.24418, 0.34069) (0.25229, 0.34733)
t = 11.87815 t = 12.36589
p = 0.00000 p = 0.00000
Constant 15.09573 18.68649
(-22.37738, 52.56883) (-18.12650, 55.49948)
t = 0.78956 t = 0.99489
p = 0.42994 p = 0.31999
Observations 1,279 1,274
R2 0.36945 0.38754
Adjusted R2 0.36448 0.38269
Residual Std. Error 0.64763 (df = 1268) 0.63437 (df = 1263)
F Statistic 74.29508*** (df = 10; 1268) 79.91719*** (df = 10; 1263)
Note: p<0.1; p<0.05; p<0.01

4.3.3 Model Comparison

ad.r.sq1 <- summary(model_red1)$adj.r.squared
ad.r.sq2 <- summary(model_red2)$adj.r.squared

Adjusted R-squared for 1st and the second models are 0.3644797 and 0.3826897, respectively. The difference between both is 1.821%.

5 Feature Selection

5.1 Use stepwise algorithm: Forward, Backward and Step-wise

First, let’s define the start and stop thresholds for the algorithm. We will also use the regsubset function from the leaps package to perform the stepwise regression.

model_redAlc <- lm(quality ~ alcohol, data = X.train_red)
model_redAll <- lm(quality ~ ., data = X.train_red)

5.1.1 Backward Approach

5.1.1.1 Variable Selection Sequence

The variable selection dataframe below will show which variables were forced in and/or forced out for the backward approach.

OLS.regback <- leaps::regsubsets(quality ~ ., X.train_red,
    method = "backward", nvmax = 11)
OLSregbacksum <- summary(OLS.regback)
Varselect <- data.frame(Forcein = OLSregbacksum$obj$force.in,
    Forceout = OLSregbacksum$obj$force.out)

Varselect %>%
    kbl() %>%
    kable_styling(bootstrap_options = "striped", full_width = F,
        position = "center", font_size = 9)
Forcein Forceout
TRUE FALSE
fixed.acidity FALSE FALSE
volatile.acidity FALSE FALSE
citric.acid FALSE FALSE
residual.sugar FALSE FALSE
chlorides FALSE FALSE
free.sulfur.dioxide FALSE FALSE
total.sulfur.dioxide FALSE FALSE
density FALSE FALSE
pH FALSE FALSE
sulphates FALSE FALSE
alcohol FALSE FALSE

5.1.1.2 Model Output Matrix

We also need to see the variables included in each model. The table below will show that model 1 only includes alcohol while model 2 includes volatile.acidity and alcohol.

exhselect <- data.frame(OLSregbacksum$outmat)
exhselect %>%
    kbl() %>%
    kable_styling(bootstrap_options = "striped", full_width = F,
        position = "center", font_size = 9)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
1 ( 1 ) *
2 ( 1 ) * *
3 ( 1 ) * * *
4 ( 1 ) * * * *
5 ( 1 ) * * * * *
6 ( 1 ) * * * * * *
7 ( 1 ) * * * * * * *
8 ( 1 ) * * * * * * * *
9 ( 1 ) * * * * * * * * *
10 ( 1 ) * * * * * * * * * *
11 ( 1 ) * * * * * * * * * * *

Now, let’s look at the detailed model measurement metrics, including The R^2, Adjusted-R^2, Mellow’s Cp, AIC, and BIC. These can be directly extracted from the OLS regression summary we have created in the first step.

5.1.1.3 R^2

rsqmat <- data.frame(t(OLSregbacksum$rsq))
rsqmat %>%
    kbl() %>%
    kable_styling(bootstrap_options = "striped", full_width = F,
        position = "center", font_size = 9)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
0.2596942 0.3406592 0.3608397 0.3743383 0.3799897 0.3847493 0.3865018 0.3866865 0.3871627 0.3876856 0.3884823

5.1.1.4 Adjusted R^2

adjr2mat <- data.frame(t(OLSregbacksum$adjr2))
adjr2mat %>%
    kbl() %>%
    kable_styling(bootstrap_options = "striped", full_width = F,
        position = "center", font_size = 9)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
0.2591122 0.3396217 0.3593299 0.3723662 0.3775448 0.3818357 0.3831096 0.3828078 0.3827992 0.3828375 0.3831521

5.1.1.5 Mellow’s Cp

cpmat <- data.frame(t(OLSregbacksum$cp))
cpmat %>%
    kbl() %>%
    kable_styling(bootstrap_options = "striped", full_width = F,
        position = "center", font_size = 9)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
257.7822 92.69342 53.04641 27.18921 17.52635 9.703904 8.087153 9.706003 10.7232 11.64424 12

5.1.1.6 Bayesian Information Criterion (BIC)

bicmat <- data.frame(t(OLSregbacksum$bic))
bicmat %>%
    kbl() %>%
    kable_styling(bootstrap_options = "striped", full_width = F,
        position = "center", font_size = 9)
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
-368.7818 -509.19 -541.6428 -561.6869 -566.0969 -568.7647 -565.249 -558.4827 -552.3224 -546.2598 -540.7687

5.1.1.7 Combining the Model Output Matrices

# Combine the four data frames into one
combined_df <- data.frame(
  R2 = unlist(rsqmat),
  Adj_R2 = unlist(adjr2mat),
  Mellow_Cp = unlist(cpmat),
  BIC = unlist(bicmat)
)

# Create a kable table with all the values
kable(combined_df, format = "html", col.names = c("R-sq.", "Adj R-sq.", "Mellow's Cp", "BIC")) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "center", font_size = 9)
R-sq. Adj R-sq. Mellow's Cp BIC
X1 0.2596942 0.2591122 257.782198 -368.7818
X2 0.3406592 0.3396217 92.693417 -509.1900
X3 0.3608397 0.3593299 53.046411 -541.6428
X4 0.3743383 0.3723662 27.189213 -561.6869
X5 0.3799897 0.3775448 17.526348 -566.0969
X6 0.3847493 0.3818357 9.703904 -568.7647
X7 0.3865018 0.3831096 8.087153 -565.2490
X8 0.3866865 0.3828078 9.706003 -558.4827
X9 0.3871627 0.3827992 10.723197 -552.3224
X10 0.3876856 0.3828375 11.644237 -546.2598
X11 0.3884823 0.3831521 12.000000 -540.7687

5.1.1.8 Diagnostic plots for the Backward Selection

par(mfrow = c(2, 2))
plot(OLSregbacksum$rss, xlab = "Number of Variables\n(a)", ylab = "RSS",
    type = "l", lwd = 1.5, cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
    font.axis = 2, font.lab = 2, main = "Number of Variables Vs. RSS",
    panel.first = grid(nx = NULL, ny = NULL, col = "gray", lty = 2))

plot(OLSregbacksum$adjr2, xlab = "Number of Variables\n(b)",
    ylab = "Adjusted RSq", type = "l", lwd = 1.5, main = "Number of Variables Vs. Adjusted R2",
    cex.main = 1.15, cex.lab = 1, cex.axis = 1.05, font.axis = 2,
    font.lab = 2, panel.first = grid(nx = NULL, ny = NULL, col = "gray",
        lty = 2))
points(6, OLSregbacksum$adjr2[6], col = "#336699", cex = 2, pch = 20)

plot(OLSregbacksum$cp, xlab = "Number of Variables\n(c)", ylab = "Cp",
    type = "l", lwd = 1.5, cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
    font.axis = 2, font.lab = 2, main = "Number of Variables Vs. Mellow's Cp",
    panel.first = grid(nx = NULL, ny = NULL, col = "gray", lty = 2))
points(6, OLSregbacksum$cp[6], col = "#336699", cex = 2, pch = 20)

plot(OLSregbacksum$bic, xlab = "Number of Variables\n(d)", ylab = "BIC",
    type = "l", lwd = 1.5, cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
    font.axis = 2, font.lab = 2, main = "Number of Variables Vs. BIC",
    panel.first = grid(nx = NULL, ny = NULL, col = "gray", lty = 2))
points(6, OLSregbacksum$bic[6], col = "#336699", cex = 2, pch = 20)

Backward Selection Diagnostic Plots

5.1.1.9 Best Model from Backward Selection

# Find the model with the lowest BIC value
best_model_index <- which.min(OLSregbacksum$bic)

# Get the best model
best_model <- coef(OLS.regback, id = best_model_index)

# Extract the coefficients from the best model and remove the intercept term from the coefficients
best_coefficients <- round(unname(best_model)[-1], 4)

# Get the variable names from your redDF dataframe
variable_names <- names(best_model)[-1]

# Extract the intercept coefficient separately
intercept_coefficient <- round(unname(best_model)[1], 4)

# Define the named_coefficients variable
named_coefficients <- setNames(best_coefficients, variable_names)

# Create the LaTeX equation
equation <- paste("y =", intercept_coefficient, "+", paste(named_coefficients, variable_names, sep = " * ", collapse = " + "), "")

$y = 3.9765 + -0.9888 * volatile.acidity + -1.6884 * chlorides + -0.0029 * total.sulfur.dioxide + -0.392 * pH + 0.8807 * sulphates + 0.3083 * alcohol $

5.1.2 Forward Approach

Following the leaps::regsubsets function, we will use the forward approach by changing method = "backward" to method = "forward" to select the best model. Make sure to change all OLS.regback to OLS.regfwd and related variables.

Caution: Running all three subset selection methods in one file will result in longer run times/knit times.

5.1.3 Best Subset Approach

Following the leaps::regsubsets function, we will use the best approach by changing method = "backward" to method = "best" to select the best model. Make sure to change all OLS.regback to OLS.regbest and related variables.

Caution: Running all three subset selection methods in one file will result in longer run times/knit times.

6 Check model performance

6.1 Prepare performance indicator function

# library(MLmetrics)
# indicator <- function(model, y_pred, y_true) {
#   adj.r.sq <- summary(model)$adj.r.squared
#   mse <- MSE(y_pred, y_true)
#   rmse <- RMSE(y_pred, y_true)
#   mae <- MAE(y_pred, y_true)
#   print(paste0("Adjusted R-squared: ", round(adj.r.sq, 4)))
#   print(paste0("MSE: ", round(mse, 4)))
#   print(paste0("RMSE: ", round(rmse, 4)))
#   print(paste0("MAE: ", round(mae, 4)))
# }
# 
# indicator(model = model.both_red, y_pred = model.both_red$fitted.values, y_true = X.train_red$quality)

best_model_back <-  lm(quality~ volatile.acidity + chlorides+ total.sulfur.dioxide+                 pH+            sulphates  +            alcohol, data=X.train_red)
stargazer::stargazer(best_model_back, type = "html",title = "Backward Selection Best Model", ci=TRUE, single.row = TRUE, no.space = FALSE, align = TRUE, digits=4, font.size = "small",  report = "vc*stp")
Backward Selection Best Model
Dependent variable:
quality
volatile.acidity -0.9888*** (-1.2045, -0.7731)
t = -8.9846
p = 0.0000
chlorides -1.6884*** (-2.5485, -0.8283)
t = -3.8474
p = 0.0002
total.sulfur.dioxide -0.0029*** (-0.0040, -0.0019)
t = -5.4365
p = 0.000000
pH -0.3920*** (-0.6374, -0.1466)
t = -3.1307
p = 0.0018
sulphates 0.8807*** (0.6426, 1.1188)
t = 7.2496
p = 0.0000
alcohol 0.3083*** (0.2733, 0.3433)
t = 17.2681
p = 0.0000
Constant 3.9765*** (3.1272, 4.8257)
t = 9.1773
p = 0.0000
Observations 1,274
R2 0.3847
Adjusted R2 0.3818
Residual Std. Error 0.6348 (df = 1267)
F Statistic 132.0538*** (df = 6; 1267)
Note: p<0.1; p<0.05; p<0.01

We compare these predictions from the training set to the test set.

# metrics <- function(y_pred, y_true){
#   mse <- MSE(y_pred, y_true)
#   rmse <- RMSE(y_pred, y_true)
#   mae <- MAE(y_pred, y_true)
#   corPredAct <- cor(y_pred, y_true)
#   print(paste0("MSE: ", round(mse, 6)))
#   print(paste0("RMSE: ", round(rmse, 6)))
#   print(paste0("MAE: ", round(mae, 6)))
#   print(paste0("Correlation: ", round(corPredAct, 6)))
#   print(paste0("R^2 between y_pred & y_true: ", round(corPredAct^2, 6)))
# }
# 
# metrics(y_pred = model.both_red$fitted.values, y_true = X.train_red$quality)
# 
# redPredict.both <- predict(model.both_red, newdata = X.test_red)
# metrics(y_pred = redPredict.both, y_true = X.test_red$quality)

# Predict on the test data
predictions <- predict(best_model_back, newdata = X.test_red)

6.1.1 Plot between Predicted vs. actual values in training set

# redFitted.both <- data.frame(qualityPred = model.both_red$fitted.values,
#                              qualityAct = X.train_red$quality)
# ggplot(redFitted.both, aes(x = qualityPred, y = qualityAct)) +
#   geom_point(aes(color = as.factor(qualityAct)), show.legend = F) +
#   geom_smooth(method = "lm", se = F) +
#   labs(title = "Predicted vs Actual Values Using Train Dataset", x = "Predicted quality", y = "Actual quality")

6.1.2 Plot between Predicted vs. actual values in test set

# redPredict.bothDF <- data.frame(qualityPred = redPredict.both, qualityAct = X.test_red$quality)
# ggplot(redPredict.bothDF, aes(x = qualityPred, y = qualityAct)) +
#   geom_point(aes(color = as.factor(qualityAct)), show.legend = F) +
#   geom_smooth(method = "lm", se = F) +
#   labs(title = "Predicted vs Actual Values Using Test Dataset", x = "Predicted quality", y = "Actual quality")
# Add the predicted values to your test data (X.test_red)

X.test_red$predicted_quality <- predictions

# Load the necessary libraries
library(ggplot2)

# Create a scatterplot to visualize the predicted values vs. actual values
ggplot(X.test_red, aes(x = quality, y = predicted_quality)) +
  geom_point() +
  labs(title = "Actual vs. Predicted Quality",
       x = "Actual Quality",
       y = "Predicted Quality") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal()

7 Equations

7.1 Causal

\begin{align} ln(y) &= \beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2} \implies y= e^{\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}}\\ \sqrt{y} &=\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}\implies y = (\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2})^2== b_0^2+b_1^2+2*b1b2\\ \sqrt[3]{y} &=\beta_{0}+\beta_{1}x_{1}+\beta_{2}x_{2}\\ cnt &= 161.807 +85.5765 *temp + 314.3430 *atemp - 275.1803 *hum+ 43.000* windspeed\\ causal &= 161.807 +85.5765 *temp + 314.3430 *atemp - 275.1803 *hum+ 43.000* windspeed\\ Registerd &= 161.807 +85.5765 *temp + 314.3430 *atemp - 275.1803 *hum+ 43.000* windspeed \end{align}

lm1 = lm(BC_cnt~.-causal-registered-instant-cnt, data=train)

8 References